## 1. **Determine the Optimal Number of Clusters**:
# Alternative elbow method via factoextra (left disabled):
# fviz_nbclust(key_measures_scaled_head, kmeans, method = "wss") # Elbow Method
# Elbow plot of within-cluster sum of squares across candidate k values.
# NOTE(review): wssplot is defined elsewhere in the project — confirm it
# accepts a raw data frame and chooses its own range of k.
wssplot(key_measures_rep_sub_sample)
# K-means goodness of fit ----
# The total within-cluster sum of squares (WSS) measures the compactness of
# the clusters; lower values indicate a better fit. WSS always decreases as
# more clusters are derived, so this is a trade-off between fit and model
# simplicity.

# Fitted k-means models, keyed by a readable label for the plot axis.
wss_models <- list(
  clusters_4  = kmeans_result_4,
  clusters_5  = kmeans_result_5,
  clusters_6  = kmeans_result_6,
  clusters_7  = kmeans_result_7,
  clusters_8  = kmeans_result_8,
  clusters_9  = kmeans_result_9,
  clusters_10 = kmeans_result_10,
  clusters_11 = kmeans_result_11,
  clusters_12 = kmeans_result_12
)

# Link each model to its summary output, stack them in order, and plot
# WSS against the number of clusters (lollipop-style: thin column + point).
Reduce(union_all, Map(link_model_output, wss_models, names(wss_models))) |>
  mutate(rn = row_number()) |>
  ggplot(aes(x = reorder(id, rn), y = tot.withinss)) +
  geom_col(width = 0.01) +
  geom_point(size = 5) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(
    x = "n_clusters",
    y = "Total within-cluster sum of squares (WSS)",
    title = "Comparing sum of squares Goodness of Fit measure",
    subtitle = "K-means cluster models"
  )
# Perform Principal Component Analysis (PCA) on the sub-sample,
# standardising each variable first.
pca_result <- prcomp(key_measures_rep_sub_sample, scale. = TRUE)

# Component scores as a data frame, tagged with the k = 6 k-means cluster
# assignment so the plot can be coloured by cluster.
pca_data <- as.data.frame(pca_result$x)
pca_data$cluster <- as.factor(kmeans_result_6$cluster)

# Pairwise scatter-plot matrix of the principal components.
ggpairs_plot <- ggpairs(pca_data, aes(color = cluster, alpha = 0.5))
#ggpairs_plot
3D plot of top 3 principal components
# Interactive 3D scatter of the first three principal components, coloured
# by cluster. NOTE(review): assumes pca_data_3d (built elsewhere) holds
# columns PC1-PC3 plus a cluster column — confirm against the chunk that
# creates it.
plot_ly(
  pca_data_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~cluster
  #colors = c('#1f77b4', '#ff7f0e', '#2ca02c')
) |>
  add_markers(size = 1) |>
  layout(
    scene = list(
      xaxis = list(title = 'PC1'),
      yaxis = list(title = 'PC2'),
      zaxis = list(title = 'PC3')
    ),
    title = '3D Plot of Principal Components'
  )
# Isolate cluster vectors from each model
kmeans_clusters <- kmeans_result_6$cluster
lpa_clusters <- selected_model_eev_6$classification
# NOTE(review): DBSCAN labels noise points as cluster 0; silhouette and the
# DB index will treat 0 as a real cluster — confirm this is intended.
dbscan_clusters <- db$cluster

# Function to sub-sample data.
# Returns a reproducible random fraction of the rows of `data`
# (fixed seed, so repeated calls give the same rows).
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample(seq_len(nrow(data)), size = floor(fraction * nrow(data)))
  data[sample_indices, ]
}

# Evaluate clustering performance on a sub-sample.
# data:     numeric data the clustering was fitted on
# clusters: cluster label per row of `data` (any label type; coerced numeric)
# fraction: share of rows to sample (default 0.25)
# Returns list(silhouette_score, db_index, wcss).
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Draw the sample indices here (same fixed seed as subsample_data) so the
  # cluster labels can be aligned with the sampled rows.
  set.seed(123) # For reproducibility
  sample_indices <- sample(seq_len(nrow(data)), size = floor(fraction * nrow(data)))
  subsample <- data[sample_indices, ]

  # BUG FIX: previously this took the FIRST n labels
  # (clusters[1:nrow(subsample)]), which do not correspond to the sampled
  # rows — all three metrics were computed on misaligned assignments.
  subsample_clusters <- as.numeric(clusters[sample_indices])

  # Mean silhouette width (higher is better).
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Davies-Bouldin index (lower is better).
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # Within-cluster sum of squares of the partition under evaluation.
  # (Previously a fresh k-means was refitted on the sub-sample, so the WCSS
  # described a different clustering than the one being scored.)
  wcss <- sum(vapply(
    split(as.data.frame(subsample), subsample_clusters),
    function(grp) sum(scale(as.matrix(grp), center = TRUE, scale = FALSE)^2),
    numeric(1)
  ))

  list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss)
}

# Calculate evaluation metrics for each model using a subsample
kmeans_eval <- evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters)
lpa_eval <- evaluate_clustering_subsample(key_measures_rep_sub_sample, lpa_clusters)
dbscan_eval <- evaluate_clustering_subsample(key_measures_rep_sub_sample, dbscan_clusters)

# Identify the optimum model from the three evaluation lists.
# Each argument is a list(silhouette_score, db_index, wcss). Metrics are
# normalised so 1 is best (higher silhouette is better; lower DB index and
# WCSS are better), summed with equal weight, and the model with the highest
# combined score is returned by name.
select_optimum_model <- function(kmeans_eval, lpa_eval, dbscan_eval) {
  # Combine evaluation metrics into a data frame
  eval_metrics <- data.frame(
    model = c("K-means", "LPA", "DBSCAN"),
    silhouette_score = c(
      kmeans_eval$silhouette_score,
      lpa_eval$silhouette_score,
      dbscan_eval$silhouette_score
    ),
    db_index = c(kmeans_eval$db_index, lpa_eval$db_index, dbscan_eval$db_index),
    wcss = c(kmeans_eval$wcss, lpa_eval$wcss, dbscan_eval$wcss)
  )

  # Normalize the metrics for comparison (higher silhouette score is better,
  # lower DB index and WCSS are better).
  # NOTE(review): the silhouette normalisation assumes all scores are
  # positive; a negative maximum would invert the ranking — confirm.
  eval_metrics$normalized_silhouette <-
    eval_metrics$silhouette_score / max(eval_metrics$silhouette_score)
  eval_metrics$normalized_db_index <-
    min(eval_metrics$db_index) / eval_metrics$db_index
  eval_metrics$normalized_wcss <- min(eval_metrics$wcss) / eval_metrics$wcss

  # Combined score with equal weights (adjust the weights as needed).
  eval_metrics$combined_score <- eval_metrics$normalized_silhouette +
    eval_metrics$normalized_db_index +
    eval_metrics$normalized_wcss

  # Model with the highest combined score wins.
  eval_metrics[which.max(eval_metrics$combined_score), "model"]
}
# Identify the optimum model across the three clustering approaches and
# report it.
optimum_model <- select_optimum_model(kmeans_eval, lpa_eval, dbscan_eval)
print(paste0("The optimum model is: ", optimum_model))